In [1]:
    
import scipy as sp
import matplotlib.pylab as plt
import seaborn as sea
import pandas as pd
%pylab inline
    
    
    
    
In [2]:
    
from example import read_games
# Collect everything read_games() yields into a list so that later cells can
# iterate over the records more than once.
recs = [game for game in read_games()]
    
In [3]:
    
from collections import defaultdict

# Tally, for each key found under rec['data'], how many records contain it,
# then print the key names in sorted order.
topkeys = defaultdict(int)
for rec in recs:
    for key_name in rec['data']:
        topkeys[key_name] += 1
print(sorted(topkeys))
    
    
In [4]:
    
# Print one summary row per key of rec['data']:
#   key name | records containing the key | distinct truthy values |
#   smallest value | largest value
# Values are compared as text; dicts and lists are collapsed to the
# placeholders "<DICT>" / "<LIST>" so they can share a set with strings.
for ky, tot in sorted(topkeys.items()):
    uniq = set()

    for rec in recs:
        val = rec['data'].get(ky, None)
        if not val:
            # Missing or falsy values (None, '', 0, [], {}) are left out.
            continue
        if isinstance(val, dict):
            val = "<DICT>"
        elif isinstance(val, list):
            val = "<LIST>"
        else:
            val = str(val).strip()
        uniq.add(val)

    # Every member of uniq is a str, so min/max over the set give the same
    # result as tracking the extremes one value at a time; None when empty.
    mn = min(uniq, default=None)
    mx = max(uniq, default=None)
    print("%-25s %8d %8d %20s %20s" % (ky, tot, len(uniq), str(mn)[:20], str(mx)[:20]))
    
    
In [5]:
    
from collections import Counter
from pprint import PrettyPrinter
from random import choice

# A single 120-column pretty-printer; the module-level name `pprint` is bound
# to that printer's method so it can be called like a plain function.
pp = PrettyPrinter(width=120)
pprint = pp.pprint
def fields(name):
    """Yield each truthy value stored under ``name`` in the records' 'data' dicts.

    Iterates over the module-level ``recs`` list. Records where the key is
    absent, or where its value is falsy (None, '', 0, empty containers), are
    skipped.
    """
    candidates = (rec['data'].get(name, None) for rec in recs)
    yield from filter(None, candidates)
            
field_values = list(fields('support_info'))


def out(v):
    """Pretty-print ``v`` with the shared printer, followed by an empty line."""
    pprint(v)
    print("")


# Show the first and the last collected value, then ten picked at random
# (unseeded, so the random picks differ from run to run).
for sample in (field_values[0], field_values[-1]):
    out(sample)
for _ in range(10):
    out(choice(field_values))
# def scan_values():
#     for fv in field_values:
#         free, purchase, subscript = False, False, False
#         for pg in fv:
#             if pg.get('is_recurring_subscription', '') == 'true':
#                 subscript = True
#             for s in pg.get('subs', list()):
#                 if s.get('is_free_license', None):
#                     free = True
#                 elif s.get('price_in_cents_with_discount', 0) > 0:
#                     purchase = True
#         yield (free, purchase, subscript)
# pprint(Counter(list(scan_values())))
           
# pprint(Counter([k for fv in field_values for k in fv.keys()]))
# pprint(Counter([i.get('description', 'MISSING').lower() for fv in field_values for i in fv]))
    
    
Each of these should be a boolean column (the value is a list of dicts; check each dict's 'description')
Like categories, it's a list of dicts. Use the dict['description'] for text and discretize:
If description is empty string, then ignore it.
All descriptions not described below are GenreIsOther
These descriptions become GenreIsXXX:
These descriptions become GenreIsNotGame:
Applied to linux_requirements, mac_requirements, and pc_requirements
Value is dictionary with possible keys:
Turn into boolean, i.e.
3 boolean columns: FreeVerAvail, PurchaseAvail, SubscriptionAvail
Code for all:
def package_groups(rec):
    """Derive the (free, purchase, subscription) availability flags for a record.

    Args:
        rec: a record dict; package groups are read from
            ``rec["data"]["package_groups"]``. Either level may be missing
            or None, in which case all three flags are False.

    Returns:
        A tuple ``(free, purchase, subscript)`` of booleans:
          * free      - some sub has a truthy 'is_free_license';
          * purchase  - some non-free sub has 'price_in_cents_with_discount' > 0;
          * subscript - some group has 'is_recurring_subscription' == 'true'.
    """
    # The fallback must be an empty *instance*: falling back to the `list`
    # type itself raises TypeError when the loop below iterates over it.
    pgs = (rec.get("data") or {}).get("package_groups") or []
    free, purchase, subscript = False, False, False
    for pg in pgs:
        # The flag is compared against the string 'true', not a Python bool.
        if pg.get('is_recurring_subscription', '') == 'true':
            subscript = True
        for s in pg.get('subs') or []:
            if s.get('is_free_license', None):
                free = True
            elif (s.get('price_in_cents_with_discount') or 0) > 0:
                purchase = True
    return (free, purchase, subscript)
In [6]:
    
# Load 'games-features.csv' (path relative to the notebook's working
# directory) into the DataFrame `gf`, print its column index, and show the
# first rows as this cell's output.
gf = pd.read_csv('games-features.csv')
print(gf.columns)
gf.head()
    
    
    Out[6]:
In [7]:
    
def invest(cname):
    """Describe and plot the strictly positive values of column ``cname`` of ``gf``.

    Prints a heading and the ``describe()`` summary of the rows where the
    column is > 0, then draws their distribution on a new 8x6 figure.
    """
    print(cname, " > 0")
    positive = gf.loc[gf[cname] > 0, cname]
    print(positive.describe())
    plt.figure(figsize=(8,6))
    sea.distplot(positive)
    print("")
# Run the same positive-values summary and plot for each count column.
for column_name in ("DemoCount", "DLCCount", "RecommendationCount"):
    invest(column_name)
    
    
    
    
    
In [9]:
    
# Display the "PCMinReqsText" column of gf as this cell's output
# (presumably the PC minimum-requirements text; confirm against the CSV).
gf["PCMinReqsText"]
    
    Out[9]: